This post contains notes that I took while watching a presentation given by Eduardo Ariño de la Rubia, the Chief Data Scientist at Domino Data Lab. In the presentation, he introduces 23 visualizations and the scenarios in which each one is appropriate. You can find the presentation here: "Video: 23 Visualizations and When to Use Them". My goal is to familiarize myself with these plots by writing the code myself, mainly with ggplot2, using datasets that I am already familiar with.
library(tidyverse)
Loading tidyverse: ggplot2
Loading tidyverse: tibble
Loading tidyverse: tidyr
Loading tidyverse: readr
Loading tidyverse: purrr
Loading tidyverse: dplyr
Conflicts with tidy packages ---------------------------------------------------------------------------------------------------
filter(): dplyr, stats
lag(): dplyr, stats
library(lubridate)
Attaching package: ‘lubridate’
The following object is masked from ‘package:base’:
date
library(readxl)
# Session-wide defaults: use the classic ggplot2 theme for every plot and set
# a large `scipen` penalty so numbers are printed in fixed notation.
theme_set(theme_classic())
options(scipen = 999)

# Read the first worksheet of the Superstore workbook, taking the column
# names from its first row.
# NOTE(review): the import warnings printed below report text such as `05408`
# in column 12 where a number was expected -- confirm whether that column
# needs an explicit text type.
global <- read_excel("global_superstore.xls", sheet = 1, col_names = TRUE)
DEFINEDNAME: 21 00 00 01 0b 00 00 00 01 00 00 00 00 00 00 0d 3b 02 00 00 00 5a c8 00 00 17 00
DEFINEDNAME: 8b 03 00 0f 02 00 00 00 00 00 00 00 00 00 00 5f 78 6c 66 6e 2e 4e 4f 52 4d 2e 44 49 53 54 1c 1d
DEFINEDNAME: 8b 03 00 0e 02 00 00 00 00 00 00 00 00 00 00 5f 78 6c 66 6e 2e 4e 4f 52 4d 2e 49 4e 56 1c 1d
DEFINEDNAME: 00 00 00 0b 07 00 00 00 00 00 00 00 00 00 00 4c 61 6e 67 75 61 67 65 5f 49 44 3a 01 00 00 00 04 00
DEFINEDNAME: 21 00 00 01 0b 00 00 00 01 00 00 00 00 00 00 0d 3b 02 00 00 00 5a c8 00 00 17 00
DEFINEDNAME: 8b 03 00 0f 02 00 00 00 00 00 00 00 00 00 00 5f 78 6c 66 6e 2e 4e 4f 52 4d 2e 44 49 53 54 1c 1d
DEFINEDNAME: 8b 03 00 0e 02 00 00 00 00 00 00 00 00 00 00 5f 78 6c 66 6e 2e 4e 4f 52 4d 2e 49 4e 56 1c 1d
DEFINEDNAME: 00 00 00 0b 07 00 00 00 00 00 00 00 00 00 00 4c 61 6e 67 75 61 67 65 5f 49 44 3a 01 00 00 00 04 00
DEFINEDNAME: 21 00 00 01 0b 00 00 00 01 00 00 00 00 00 00 0d 3b 02 00 00 00 5a c8 00 00 17 00
DEFINEDNAME: 8b 03 00 0f 02 00 00 00 00 00 00 00 00 00 00 5f 78 6c 66 6e 2e 4e 4f 52 4d 2e 44 49 53 54 1c 1d
DEFINEDNAME: 8b 03 00 0e 02 00 00 00 00 00 00 00 00 00 00 5f 78 6c 66 6e 2e 4e 4f 52 4d 2e 49 4e 56 1c 1d
DEFINEDNAME: 00 00 00 0b 07 00 00 00 00 00 00 00 00 00 00 4c 61 6e 67 75 61 67 65 5f 49 44 3a 01 00 00 00 04 00
DEFINEDNAME: 21 00 00 01 0b 00 00 00 01 00 00 00 00 00 00 0d 3b 02 00 00 00 5a c8 00 00 17 00
DEFINEDNAME: 8b 03 00 0f 02 00 00 00 00 00 00 00 00 00 00 5f 78 6c 66 6e 2e 4e 4f 52 4d 2e 44 49 53 54 1c 1d
DEFINEDNAME: 8b 03 00 0e 02 00 00 00 00 00 00 00 00 00 00 5f 78 6c 66 6e 2e 4e 4f 52 4d 2e 49 4e 56 1c 1d
DEFINEDNAME: 00 00 00 0b 07 00 00 00 00 00 00 00 00 00 00 4c 61 6e 67 75 61 67 65 5f 49 44 3a 01 00 00 00 04 00
Expecting numeric in [898, 12] got `05408`Expecting numeric in [1184, 12] got `05408`Expecting numeric in [3115, 12] got `05408`Expecting numeric in [4790, 12] got `05408`Expecting numeric in [6399, 12] got `05408`Expecting numeric in [23425, 12] got `05408`Expecting numeric in [26001, 12] got `05408`Expecting numeric in [43152, 12] got `05408`Expecting numeric in [45778, 12] got `05408`Expecting numeric in [50150, 12] got `05408`Expecting numeric in [50413, 12] got `05408`
# Rename the spreadsheet headers to snake_case identifiers, then derive the
# calendar parts of the order date that later chunks filter and facet on.
global <- global %>%
  rename(
    row_id = `Row ID`,
    order_id = `Order ID`,
    order_date = `Order Date`,
    ship_date = `Ship Date`,
    ship_mode = `Ship Mode`,
    customer_id = `Customer ID`,
    customer_name = `Customer Name`,
    segment = Segment,
    city = City,
    state = State,
    country = Country,
    postal_code = `Postal Code`,
    market = Market,
    region = Region,
    product_id = `Product ID`,
    category = Category,
    sub_category = `Sub-Category`,
    product_name = `Product Name`,
    sales = Sales,
    quantity = Quantity,
    discount = Discount,
    profit = Profit,
    shipping_cost = `Shipping Cost`,
    order_priority = `Order Priority`
  ) %>%
  mutate(
    weekday = wday(order_date, label = TRUE),
    day = day(order_date),
    month = month(order_date, label = TRUE),
    year = year(order_date)
  )

# Show the prepared table.
global
# Keep only the columns needed for the diverging plots, tag each order as a
# "loss" (negative profit) or a "gain" (everything else), and sort by profit.
diverging_profit <- global %>%
  select(order_date, region, profit) %>%
  mutate(gain_loss = ifelse(profit < 0, "loss", "gain")) %>%
  arrange(profit)

# Diverging bar: profit per region drawn as horizontal bars, coloured by
# whether the order was a gain or a loss.
ggplot(
  diverging_profit,
  aes(x = region, y = profit, label = profit)
) +
  geom_bar(aes(fill = gain_loss), stat = "identity", width = 0.8) +
  scale_fill_manual(values = c("gain" = "#1F77B4", "loss" = "#FF7F0E")) +
  coord_flip() +
  labs(
    title = "Diverging Bar",
    subtitle = "Profit loss or gain from a fixed reference point of 0"
  )
# Diverging dot plot: one semi-transparent point per order, laid out
# horizontally by region and coloured by gain/loss.
ggplot(
  diverging_profit,
  aes(x = region, y = profit, label = profit)
) +
  geom_point(
    aes(colour = gain_loss),
    stat = "identity",
    size = 2,
    alpha = 0.6
  ) +
  scale_colour_manual(values = c("gain" = "#1F77B4", "loss" = "#FF7F0E")) +
  coord_flip() +
  labs(
    title = "Diverging Dot Plot",
    subtitle = "Profit loss or gain from a fixed reference point of 0"
  )
# Orders placed before 1 June 2011, each tagged as a gain or a loss.
# NOTE(review): despite its name, `filter_2012` holds orders from before
# June 2011; the name is kept so that any other code referring to it still
# works.
filter_2012 <- global %>%
  # Same rule as `diverging_profit`: only negative profit is a loss, so an
  # order with exactly zero profit is classified the same way in every plot.
  mutate(gain_loss = ifelse(profit < 0, "loss", "gain")) %>%
  # Compare calendar dates rather than a date-time against a bare string.
  # A character cutoff is parsed in the session time zone, so the rows kept
  # could differ between machines (assuming `order_date` is a date-time, as
  # read_excel returns -- confirm).
  filter(as.Date(order_date) < as.Date("2011-06-01"))

# Diverging area chart of profit over time, filled by gain/loss.
ggplot(filter_2012, aes(order_date, profit, fill = gain_loss)) +
  geom_area() +
  scale_fill_manual(values = c("gain" = "#1F77B4", "loss" = "#FF7F0E")) +
  labs(
    title = "Diverging Area Chart",
    subtitle = "Profit loss or gain from a fixed reference point of 0"
  )
# Sales against profit for every order, coloured by customer segment, with a
# LOESS smoothing line (no confidence band) drawn over the points.
global %>%
  ggplot(aes(x = sales, y = profit)) +
  geom_point(aes(colour = segment), alpha = 0.6) +
  geom_smooth(method = "loess", se = FALSE) +
  scale_colour_manual(values = c("#1F77B4", "#FF7F0E", "#2CA02C")) +
  labs(
    title = "Scatterplot with Smoothing Line Based on LOESS",
    subtitle = "Sales vs Profit"
  )
# Show the mpg data set that the next plot uses.
mpg

# Highway mileage (x) against engine displacement (y) with a LOESS smoothing
# line and no confidence band.
mpg %>%
  ggplot(aes(x = hwy, y = displ)) +
  geom_point(colour = "#FF7F0E") +
  geom_smooth(method = "loess", se = FALSE) +
  labs(
    title = "Scatterplot with Smoothing Line Based on LOESS",
    subtitle = "Engine Displacement vs Highway Miles Per Gallon"
  )
# Total sales per region. `region` becomes a factor whose levels run from the
# highest to the lowest total, so the plots below show regions in rank order.
ranked_region <- global %>%
  group_by(region) %>%
  summarise(sum = sum(sales)) %>%
  mutate(
    region = factor(region, levels = region[order(sum, decreasing = TRUE)])
  )

# Ranked bar chart: one bar per region, tallest first, with slanted x labels.
ggplot(ranked_region, aes(x = region, y = sum)) +
  geom_bar(stat = "identity", width = 0.8, fill = "#1F77B4") +
  labs(title = "Ranked Bar Chart", subtitle = "Sales by Region") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
NA
# Dot plot of regional sales totals: a point per region plus a thin dashed
# guide line spanning the smallest to the largest total, drawn horizontally.
ranked_region %>%
  ggplot(aes(x = region, y = sum)) +
  geom_point(size = 3, colour = "#1F77B4", alpha = 0.9) +
  geom_segment(
    aes(x = region, xend = region, y = min(sum), yend = max(sum)),
    linetype = "dashed",
    size = 0.1
  ) +
  coord_flip() +
  labs(title = "Dot Plot Ranking Bar", subtitle = "Sales vs Region")
Distribution plots show how often values of a variable occur.
# Histogram of highway mileage in seven bins, with black bar outlines.
mpg %>%
  ggplot(aes(x = hwy)) +
  geom_histogram(bins = 7, colour = "black", fill = "#1F77B4") +
  labs(
    title = "Histogram",
    subtitle = "Count of Highway Miles"
  )
# Treat the cylinder count as a categorical variable from here on, then show
# the modified data set.
mtcars$cyl <- as.factor(mtcars$cyl)
mtcars

# Overlapping density curves of car weight, one per cylinder group, on an
# x axis limited to 0-6.
ggplot(mtcars, aes(wt)) +
  geom_density(aes(fill = cyl), alpha = 0.7) +
  xlim(0, 6) +
  scale_fill_manual(values = c("#1F77B4", "#2CA02C", "#FF7F0E")) +
  labs(
    title = "Density plot",
    subtitle = "Weight (1000 lbs) per Cylinder",
    caption = "Source: mtcars",
    x = "Weight",
    fill = "Cylinders"
  )
# Quarter-mile time by cylinder group; `varwidth = TRUE` asks ggplot2 for
# variable-width boxes.
mtcars %>%
  ggplot(aes(cyl, qsec, group = cyl)) +
  geom_boxplot(varwidth = TRUE, fill = "#1F77B4") +
  labs(
    title = "Boxplot",
    subtitle = "1/4 mile time per Cylinder",
    caption = "Source: mtcars",
    x = "Cylinders"
  )
# Number of order lines per sub-category, with each bar filled by the parent
# category and the x labels slanted.
global %>%
  ggplot(aes(sub_category)) +
  geom_bar(aes(fill = category)) +
  scale_fill_manual(values = c("#1F77B4", "#FF7F0E", "#2CA02C")) +
  labs(
    title = "Composition",
    subtitle = "Counts of Sub-Category - Grouped by Category",
    caption = "Source: Superstore",
    x = "Sub-Category",
    fill = "Category"
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
NA
global
Stacked Column
# Number of order lines per market, stacked by product category, with the
# x labels slanted.
global %>%
  ggplot(aes(market)) +
  geom_bar(aes(fill = category)) +
  scale_fill_manual(values = c("#1F77B4", "#FF7F0E", "#2CA02C")) +
  labs(
    title = "Category Stacked - Bar Chart",
    subtitle = "Markets - Grouped by Category",
    x = "Market",
    fill = "Category",
    caption = "Source: Superstore"
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
# Orders from January 2012, with the geographic columns converted to factors
# for the treemap below.
global_treemap <- global %>%
  filter(year == 2012 & month == "Jan") %>%
  mutate(
    country = as.factor(country),
    region = as.factor(region),
    market = as.factor(market)
  )
library(treemapify)
Loading required package: plyr
------------------------------------------------------------------------------------------------------------------------------
You have loaded plyr after dplyr - this is likely to cause problems.
If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
library(plyr); library(dplyr)
------------------------------------------------------------------------------------------------------------------------------
Attaching package: ‘plyr’
The following object is masked from ‘package:lubridate’:
here
The following objects are masked from ‘package:dplyr’:
arrange, count, desc, failwith, id, mutate, rename, summarise, summarize
The following object is masked from ‘package:purrr’:
compact
Loading required package: reshape2
Attaching package: ‘reshape2’
The following object is masked from ‘package:tidyr’:
smiths
Package treemapify 0.2.2 has been loaded!
# Compute the treemap layout: tiles sized by sales, filled by profit and
# grouped by market.
treeMapCoordinates <- treemapify(global_treemap, area = "sales", fill = "profit", group = "market")

# Turn the layout into a plot object and draw it.
treeMapPlot <- ggplotify(treeMapCoordinates)
print(treeMapPlot)
# All orders placed in 2012.
year_2012 <- filter(global, year == 2012)

# Sales over time as a line chart, one stacked panel per customer segment.
year_2012 %>%
  ggplot(aes(order_date, sales)) +
  geom_line(colour = "#1F77B4") +
  facet_wrap(~segment, nrow = 3) +
  labs(
    title = "Change Trends",
    subtitle = "Sales by segment",
    x = "Date",
    caption = "Source: Superstore"
  )
library(superheat)

# Build the heatmap matrix from the untouched copy of mtcars in the datasets
# package. Earlier in this script `mtcars$cyl` is converted to a factor, and
# data.matrix() replaces factors with their internal integer codes, so the
# modified copy would put 1/2/3 in the `cyl` column instead of the real
# cylinder counts 4/6/8.
mtcars_matrix <- data.matrix(datasets::mtcars)

# Heatmap of every car (rows) against every variable (columns).
superheat(
  mtcars_matrix,
  left.label.size = 0.3,
  left.label.text.size = 3,
  legend.text.size = 12,
  padding = 0.1
)